{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Create Network Graph\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Basic setup\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 268,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from pyvis.network import Network\n",
    "import networkx as nx\n",
    "import seaborn as sns"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Loading previously calculated dataframes\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 269,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(359, 5)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>entity</th>\n",
       "      <th>importance</th>\n",
       "      <th>category</th>\n",
       "      <th>chunk_id</th>\n",
       "      <th>type</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>interoperability</td>\n",
       "      <td>5</td>\n",
       "      <td>concept</td>\n",
       "      <td>bb894d4a987241c6b7827d4acabac6dc</td>\n",
       "      <td>concept</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>introduction</td>\n",
       "      <td>3</td>\n",
       "      <td>event</td>\n",
       "      <td>bb894d4a987241c6b7827d4acabac6dc</td>\n",
       "      <td>concept</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>unique charging port design</td>\n",
       "      <td>4</td>\n",
       "      <td>misc</td>\n",
       "      <td>ebcfb9727a42482d986fa2b0a64fa8ac</td>\n",
       "      <td>concept</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>usb c</td>\n",
       "      <td>5</td>\n",
       "      <td>object</td>\n",
       "      <td>ebcfb9727a42482d986fa2b0a64fa8ac</td>\n",
       "      <td>concept</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>mobile phone model</td>\n",
       "      <td>3</td>\n",
       "      <td>object</td>\n",
       "      <td>ebcfb9727a42482d986fa2b0a64fa8ac</td>\n",
       "      <td>concept</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                        entity  importance category  \\\n",
       "0             interoperability           5  concept   \n",
       "1                 introduction           3    event   \n",
       "2  unique charging port design           4     misc   \n",
       "3                        usb c           5   object   \n",
       "4           mobile phone model           3   object   \n",
       "\n",
       "                           chunk_id     type  \n",
       "0  bb894d4a987241c6b7827d4acabac6dc  concept  \n",
       "1  bb894d4a987241c6b7827d4acabac6dc  concept  \n",
       "2  ebcfb9727a42482d986fa2b0a64fa8ac  concept  \n",
       "3  ebcfb9727a42482d986fa2b0a64fa8ac  concept  \n",
       "4  ebcfb9727a42482d986fa2b0a64fa8ac  concept  "
      ]
     },
     "execution_count": 269,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataframe_dir = \"MediumArticles\"\n",
    "# df = pd.read_csv(f\"./data_output/{dataframe_dir}/chunks.csv\", sep=\"|\")\n",
    "df_concepts = pd.read_csv(f\"./data_output/{dataframe_dir}/concepts.csv\", sep=\"|\")\n",
    "print(df_concepts.shape)\n",
    "df_concepts.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Calculate Graph Dataframe\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Graph dataframe is a dataframe where every row is a connection between two nodes.\n",
    "\n",
    "It is basically an inner self join of the nodes dataframe\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 240,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total number of nodes =  2570\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>index</th>\n",
       "      <th>entity_L</th>\n",
       "      <th>importance_L</th>\n",
       "      <th>category_L</th>\n",
       "      <th>chunk_id</th>\n",
       "      <th>type_L</th>\n",
       "      <th>entity_R</th>\n",
       "      <th>importance_R</th>\n",
       "      <th>category_R</th>\n",
       "      <th>type_R</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>interoperability</td>\n",
       "      <td>5</td>\n",
       "      <td>concept</td>\n",
       "      <td>bb894d4a987241c6b7827d4acabac6dc</td>\n",
       "      <td>concept</td>\n",
       "      <td>introduction</td>\n",
       "      <td>3</td>\n",
       "      <td>event</td>\n",
       "      <td>concept</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>introduction</td>\n",
       "      <td>3</td>\n",
       "      <td>event</td>\n",
       "      <td>bb894d4a987241c6b7827d4acabac6dc</td>\n",
       "      <td>concept</td>\n",
       "      <td>interoperability</td>\n",
       "      <td>5</td>\n",
       "      <td>concept</td>\n",
       "      <td>concept</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>5</td>\n",
       "      <td>unique charging port design</td>\n",
       "      <td>4</td>\n",
       "      <td>misc</td>\n",
       "      <td>ebcfb9727a42482d986fa2b0a64fa8ac</td>\n",
       "      <td>concept</td>\n",
       "      <td>usb c</td>\n",
       "      <td>5</td>\n",
       "      <td>object</td>\n",
       "      <td>concept</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>6</td>\n",
       "      <td>unique charging port design</td>\n",
       "      <td>4</td>\n",
       "      <td>misc</td>\n",
       "      <td>ebcfb9727a42482d986fa2b0a64fa8ac</td>\n",
       "      <td>concept</td>\n",
       "      <td>mobile phone model</td>\n",
       "      <td>3</td>\n",
       "      <td>object</td>\n",
       "      <td>concept</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>7</td>\n",
       "      <td>unique charging port design</td>\n",
       "      <td>4</td>\n",
       "      <td>misc</td>\n",
       "      <td>ebcfb9727a42482d986fa2b0a64fa8ac</td>\n",
       "      <td>concept</td>\n",
       "      <td>phone companies</td>\n",
       "      <td>4</td>\n",
       "      <td>organisation</td>\n",
       "      <td>concept</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   index                     entity_L  importance_L category_L  \\\n",
       "0      1             interoperability             5    concept   \n",
       "1      2                 introduction             3      event   \n",
       "2      5  unique charging port design             4       misc   \n",
       "3      6  unique charging port design             4       misc   \n",
       "4      7  unique charging port design             4       misc   \n",
       "\n",
       "                           chunk_id   type_L            entity_R  \\\n",
       "0  bb894d4a987241c6b7827d4acabac6dc  concept        introduction   \n",
       "1  bb894d4a987241c6b7827d4acabac6dc  concept    interoperability   \n",
       "2  ebcfb9727a42482d986fa2b0a64fa8ac  concept               usb c   \n",
       "3  ebcfb9727a42482d986fa2b0a64fa8ac  concept  mobile phone model   \n",
       "4  ebcfb9727a42482d986fa2b0a64fa8ac  concept     phone companies   \n",
       "\n",
       "   importance_R    category_R   type_R  \n",
       "0             3         event  concept  \n",
       "1             5       concept  concept  \n",
       "2             5        object  concept  \n",
       "3             3        object  concept  \n",
       "4             4  organisation  concept  "
      ]
     },
     "execution_count": 240,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dfne_join = pd.merge(\n",
    "    df_concepts, df_concepts, how=\"inner\", on=\"chunk_id\", suffixes=(\"_L\", \"_R\")\n",
    ")\n",
    "\n",
    "## Remove self Loops\n",
    "self_loops_drop = dfne_join[dfne_join[\"entity_L\"] == dfne_join[\"entity_R\"]].index\n",
    "dfg = dfne_join.drop(index=self_loops_drop).reset_index()\n",
    "\n",
    "## This is our graph dataframe\n",
    "print(\"Total number of nodes = \", dfg.shape[0])\n",
    "dfg.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Clean the graph dataframe\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The original graph dataframe is too big to visualise. So we will make another dataframe for visualisation purpose.\n",
    "\n",
    "-   remove the less important nodes\n",
    "-   remove less important edges\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 259,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Less important Nodes =  29 \n",
      "Less Important Edges =  2\n"
     ]
    }
   ],
   "source": [
    "## Drop nodes which are less important\n",
    "less_important_nodes = dfg[(dfg[\"importance_L\"] < 2)].index\n",
    "## Drop edges where both the nodes are less important than 5\n",
    "less_important_edges = dfg[(dfg[\"importance_L\"] < 2) & (dfg[\"importance_R\"] < 2)].index\n",
    "drops = less_important_nodes.union(less_important_edges)\n",
    "\n",
    "print(\n",
    "    \"Less important Nodes = \",\n",
    "    less_important_nodes.shape[0],\n",
    "    \"\\nLess Important Edges = \",\n",
    "    less_important_edges.shape[0],\n",
    ")\n",
    "\n",
    "## Remove these rows from the graph dataframe\n",
    "dfg_vis = dfg.drop(index=drops).reset_index()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Combine similar edges\n",
    "\n",
    "Group the edges between the same nodes and combine them into single edge with its weight equal to the count. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 260,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Final Number of Edges in the Visualisation Graph =  2481\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>entity_L</th>\n",
       "      <th>entity_R</th>\n",
       "      <th>importance_L</th>\n",
       "      <th>importance_R</th>\n",
       "      <th>chunks</th>\n",
       "      <th>count</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>10000 w 100th ave</td>\n",
       "      <td>35200</td>\n",
       "      <td>2.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>45a6201014f84cc7bcb7aea24a7f7af6</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>10000 w 100th ave</td>\n",
       "      <td>35209</td>\n",
       "      <td>2.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>45a6201014f84cc7bcb7aea24a7f7af6</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>10000 w 100th ave</td>\n",
       "      <td>99def</td>\n",
       "      <td>2.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>45a6201014f84cc7bcb7aea24a7f7af6</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>10000 w 100th ave</td>\n",
       "      <td>adt_a01</td>\n",
       "      <td>2.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>45a6201014f84cc7bcb7aea24a7f7af6</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>10000 w 100th ave</td>\n",
       "      <td>al</td>\n",
       "      <td>2.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>45a6201014f84cc7bcb7aea24a7f7af6</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            entity_L entity_R  importance_L  importance_R  \\\n",
       "0  10000 w 100th ave    35200           2.0           2.0   \n",
       "1  10000 w 100th ave    35209           2.0           2.0   \n",
       "2  10000 w 100th ave    99def           2.0           2.0   \n",
       "3  10000 w 100th ave  adt_a01           2.0           2.0   \n",
       "4  10000 w 100th ave       al           2.0           2.0   \n",
       "\n",
       "                             chunks  count  \n",
       "0  45a6201014f84cc7bcb7aea24a7f7af6      1  \n",
       "1  45a6201014f84cc7bcb7aea24a7f7af6      1  \n",
       "2  45a6201014f84cc7bcb7aea24a7f7af6      1  \n",
       "3  45a6201014f84cc7bcb7aea24a7f7af6      1  \n",
       "4  45a6201014f84cc7bcb7aea24a7f7af6      1  "
      ]
     },
     "execution_count": 260,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\n",
    "## Group and aggregate edges.\n",
    "dfg_vis = (\n",
    "    dfg_vis.groupby([\"entity_L\", \"entity_R\"])\n",
    "    .agg(\n",
    "        {\n",
    "            \"importance_L\": \"mean\",\n",
    "            \"importance_R\": \"mean\",\n",
    "            \"chunk_id\": [\",\".join, \"count\"],\n",
    "        }\n",
    "    )\n",
    "    .reset_index()\n",
    ")\n",
    "dfg_vis.columns = [\n",
    "    \"entity_L\",\n",
    "    \"entity_R\",\n",
    "    \"importance_L\",\n",
    "    \"importance_R\",\n",
    "    \"chunks\",\n",
    "    \"count\",\n",
    "]\n",
    "\n",
    "print(\"Final Number of Edges in the Visualisation Graph = \", dfg_vis.shape[0])\n",
    "dfg_vis.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Removing overconnected nodes\n",
    "\n",
    "These are featured in the header and the footer of the pdf file, so they are a little too connected.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 261,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Final Number of Edges  =  2481 \n",
      "Dropped edges: 0\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>entity_L</th>\n",
       "      <th>entity_R</th>\n",
       "      <th>importance_L</th>\n",
       "      <th>importance_R</th>\n",
       "      <th>chunks</th>\n",
       "      <th>count</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>10000 w 100th ave</td>\n",
       "      <td>35200</td>\n",
       "      <td>2.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>45a6201014f84cc7bcb7aea24a7f7af6</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>10000 w 100th ave</td>\n",
       "      <td>35209</td>\n",
       "      <td>2.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>45a6201014f84cc7bcb7aea24a7f7af6</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>10000 w 100th ave</td>\n",
       "      <td>99def</td>\n",
       "      <td>2.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>45a6201014f84cc7bcb7aea24a7f7af6</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>10000 w 100th ave</td>\n",
       "      <td>adt_a01</td>\n",
       "      <td>2.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>45a6201014f84cc7bcb7aea24a7f7af6</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>10000 w 100th ave</td>\n",
       "      <td>al</td>\n",
       "      <td>2.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>45a6201014f84cc7bcb7aea24a7f7af6</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            entity_L entity_R  importance_L  importance_R  \\\n",
       "0  10000 w 100th ave    35200           2.0           2.0   \n",
       "1  10000 w 100th ave    35209           2.0           2.0   \n",
       "2  10000 w 100th ave    99def           2.0           2.0   \n",
       "3  10000 w 100th ave  adt_a01           2.0           2.0   \n",
       "4  10000 w 100th ave       al           2.0           2.0   \n",
       "\n",
       "                             chunks  count  \n",
       "0  45a6201014f84cc7bcb7aea24a7f7af6      1  \n",
       "1  45a6201014f84cc7bcb7aea24a7f7af6      1  \n",
       "2  45a6201014f84cc7bcb7aea24a7f7af6      1  \n",
       "3  45a6201014f84cc7bcb7aea24a7f7af6      1  \n",
       "4  45a6201014f84cc7bcb7aea24a7f7af6      1  "
      ]
     },
     "execution_count": 261,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ind = dfg_vis[\n",
    "    dfg_vis[\"entity_L\"].isin(\n",
    "        [\"Pathways to Health Equity for the G20\", \"Accelerating Global Health\"]\n",
    "    )\n",
    "    | dfg_vis[\"entity_R\"].isin(\n",
    "        [\"Pathways to Health Equity for the G20\", \"Accelerating Global Health\"]\n",
    "    )\n",
    "].index\n",
    "dfg_vis.drop(index=ind, axis=1, inplace=True)\n",
    "print(\"Final Number of Edges  = \", dfg_vis.shape[0], \"\\nDropped edges:\", len(ind))\n",
    "dfg_vis.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Creating a NetworkX Graph\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Calculate nodes\n",
    "\n",
    "Here I am grouping the graph dataframe by left node and calculating the mean importance. This way we will end up with only the unique nodes from the graph dataframe along with their weights.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 262,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>entity_L</th>\n",
       "      <th>importance_L</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>10000 w 100th ave</td>\n",
       "      <td>2.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2010</td>\n",
       "      <td>4.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>30-year-old standard</td>\n",
       "      <td>4.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>35200</td>\n",
       "      <td>2.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>35209</td>\n",
       "      <td>2.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "               entity_L  importance_L\n",
       "0     10000 w 100th ave           2.0\n",
       "1                  2010           4.0\n",
       "2  30-year-old standard           4.0\n",
       "3                 35200           2.0\n",
       "4                 35209           2.0"
      ]
     },
     "execution_count": 262,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# nodes = df_graph[\"entity_L\"].unique()\n",
    "nodes = dfg_vis.groupby([\"entity_L\"]).agg({\"importance_L\": \"mean\"}).reset_index()\n",
    "nodes.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Build a NetworkX object with nodes and edges\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 263,
   "metadata": {},
   "outputs": [],
   "source": [
    "G = nx.Graph()\n",
    "for index, row in nodes.iterrows():\n",
    "    G.add_node(row[\"entity_L\"])\n",
    "\n",
    "for index, row in dfg_vis.iterrows():\n",
    "    G.add_edge(str(row[\"entity_L\"]), str(row[\"entity_R\"]))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Community Detection\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Detect communities using the Girvan Newman algorithm\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 264,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of Communities =  27\n"
     ]
    }
   ],
   "source": [
    "communities_generator = nx.community.girvan_newman(G)\n",
    "top_level_communities = next(communities_generator)\n",
    "next_level_communities = next(communities_generator)\n",
    "communities = sorted(map(sorted, next_level_communities))\n",
    "print(\"Number of Communities = \", len(communities))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Add colors to nodes based on community\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 265,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>entity_L</th>\n",
       "      <th>importance_L</th>\n",
       "      <th>color</th>\n",
       "      <th>group</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>10000 w 100th ave</td>\n",
       "      <td>2.0</td>\n",
       "      <td>#db576c</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2010</td>\n",
       "      <td>4.0</td>\n",
       "      <td>#db578a</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>30-year-old standard</td>\n",
       "      <td>4.0</td>\n",
       "      <td>#db57a7</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>35200</td>\n",
       "      <td>2.0</td>\n",
       "      <td>#db576c</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>35209</td>\n",
       "      <td>2.0</td>\n",
       "      <td>#db576c</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "               entity_L  importance_L    color  group\n",
       "0     10000 w 100th ave           2.0  #db576c      1\n",
       "1                  2010           4.0  #db578a      2\n",
       "2  30-year-old standard           4.0  #db57a7      3\n",
       "3                 35200           2.0  #db576c      1\n",
       "4                 35209           2.0  #db576c      1"
      ]
     },
     "execution_count": 265,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "palette = \"hls\"\n",
    "\n",
    "\n",
    "## Now add these colors to communities and make another dataframe\n",
    "def colors2Community(communities) -> pd.DataFrame:\n",
    "    ## Define a color palette\n",
    "    p = sns.color_palette(palette, len(communities)).as_hex()\n",
    "    rows = []\n",
    "    group = 0\n",
    "    for community in communities:\n",
    "        color = p.pop()\n",
    "        group += 1\n",
    "        for node in community:\n",
    "            rows += [{\"entity_L\": node, \"color\": color, \"group\": group}]\n",
    "    df_colors = pd.DataFrame(rows)\n",
    "    return df_colors\n",
    "\n",
    "\n",
    "colors = colors2Community(communities)\n",
    "\n",
    "df_nodes_colors = pd.merge(\n",
    "    nodes, colors, how=\"left\", on=\"entity_L\", suffixes=(\"_N\", \"_C\")\n",
    ")\n",
    "# nodes.head()\n",
    "df_nodes_colors.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "So now we have a nodes dataframe with colors and sizes of each node.\n",
    "\n",
    "lets recreate our graph.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 266,
   "metadata": {},
   "outputs": [],
   "source": [
    "G = nx.Graph()\n",
    "node_size_multiple = 6\n",
    "\n",
    "for index, row in df_nodes_colors.iterrows():\n",
    "    G.add_node(\n",
    "        row[\"entity_L\"],\n",
    "        size=row[\"importance_L\"] * node_size_multiple,\n",
    "        title=row[\"entity_L\"],\n",
    "        color=row[\"color\"],\n",
    "    )\n",
    "\n",
    "for index, row in dfg_vis.iterrows():\n",
    "    G.add_edge(\n",
    "        str(row[\"entity_L\"]),\n",
    "        str(row[\"entity_R\"]),\n",
    "        weight=row[\"count\"],\n",
    "        name=row[\"chunks\"],\n",
    "    )"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Visualisation\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 267,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "./docs/index.html\n"
     ]
    }
   ],
   "source": [
    "graph_output_directory = \"./docs/index.html\"\n",
    "\n",
    "net = Network(\n",
    "    notebook=False,\n",
    "    bgcolor=\"#1a1a1a\",\n",
    "    cdn_resources=\"remote\",\n",
    "    height=\"900px\",\n",
    "    width=\"100%\",\n",
    "    select_menu=True,\n",
    "    font_color=\"#cccccc\",\n",
    "    # filter_menu=True,\n",
    ")\n",
    "\n",
    "net.from_nx(G)\n",
    "net.repulsion(node_distance=150, spring_length=400)\n",
    "# net.barnes_hut(gravity=-18100, central_gravity=5.05, spring_length=380)\n",
    "net.show_buttons(filter_=[\"physics\"])\n",
    "\n",
    "net.show(graph_output_directory, notebook=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "OpenAI@3111",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
